Fit temperature models and predict growing season temperature

Author

Max Lindmark, Jan Ohlberger, Anna Gårdmark

Published

June 14, 2023

Load libraries

Code
pkgs <- c("here","tidyverse", "tidylog", "RColorBrewer", "viridis", "sdmTMB", "sdmTMBextra", "patchwork", "RCurl") 

# minpack.lm needed if using nlsLM()
if(length(setdiff(pkgs,rownames(installed.packages()))) > 0){

    install.packages(setdiff(pkgs,rownames(installed.packages())),dependencies = T)
  
  }

invisible(lapply(pkgs, library,character.only = T))
#> here() starts at /Users/maxlindmark/Dropbox/Max work/R/perch-growth
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#> 
#> Attaching package: 'tidylog'
#> 
#> 
#> The following objects are masked from 'package:dplyr':
#> 
#>     add_count, add_tally, anti_join, count, distinct, distinct_all,
#>     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
#>     full_join, group_by, group_by_all, group_by_at, group_by_if,
#>     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
#>     relocate, rename, rename_all, rename_at, rename_if, rename_with,
#>     right_join, sample_frac, sample_n, select, select_all, select_at,
#>     select_if, semi_join, slice, slice_head, slice_max, slice_min,
#>     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
#>     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
#>     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
#>     transmute_if, ungroup
#> 
#> 
#> The following objects are masked from 'package:tidyr':
#> 
#>     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
#>     spread, uncount
#> 
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> 
#> 
#> Loading required package: viridisLite
#> 
#> 
#> Attaching package: 'sdmTMBextra'
#> 
#> 
#> The following objects are masked from 'package:sdmTMB':
#> 
#>     dharma_residuals, extract_mcmc
#> 
#> 
#> 
#> Attaching package: 'RCurl'
#> 
#> 
#> The following object is masked from 'package:tidyr':
#> 
#>     complete

# devtools::install_github("seananderson/ggsidekick") # not on CRAN
library(ggsidekick)

# Use ggsidekick's sleek theme as the default for all plots in this document
theme_set(theme_sleek())

# Project root; all input/output paths below are built from this
home <- here::here()

Load cache

Code
#qwraps2::lazyload_cache_dir(path = paste0(home, "/R/analyze-data/01-fit-temp-models-predict_cache/html"))

Read data

Code
# Water temperature observations prepared earlier in the pipeline
d <- read_csv(file.path(home, "output", "temp_data_for_fitting.csv"))
#> Rows: 98616 Columns: 13
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (4): area, source, db, id
#> dbl  (8): year, temp, yday, month, day, VkDag, stn_nr, section_nr
#> date (1): date
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Fit model

Code
# Factor versions of area, data source and year for use as model covariates
d <- d |>
  mutate(
    area = factor(area),
    source_f = factor(source),
    year_f = factor(year)
  )
#> mutate: converted 'area' from character to factor (0 new NA)
#>         new variable 'source_f' (factor) with 3 unique values and 0% NA
#>         new variable 'year_f' (factor) with 83 unique values and 0% NA

# Temperature model: area-by-year fixed effects, a data-source offset, and a
# smooth over day-of-year per area (bs = "cc" is the cyclic basis, so the
# seasonal curve wraps around the year boundary; knots at 0.5/364.5 set the
# cyclic endpoints just outside the observed yday range).
# Student-t likelihood (df = 6) downweights outlying temperature records.
# Spatial and spatiotemporal random fields are switched off, so this is
# effectively a GAM fit via sdmTMB; one extra Newton loop sharpens convergence.
m <- sdmTMB(temp ~ area*year_f + source_f + s(yday, by = area, bs = "cc"), 
            data = d,
            family = student(df = 6),
            spatial = "off",
            spatiotemporal = "off",
            knots = list(yday = c(0.5, 364.5)),
            control = sdmTMBcontrol(newton_loops = 1))

Check fit

Code
# Convergence, gradient and standard-error checks on the fitted model
sanity(m)
#> ✔ Non-linear minimizer suggests successful convergence
#> ✔ Hessian matrix is positive definite
#> ✔ No extreme or very small eigenvalues detected
#> ✔ No gradients with respect to fixed effects are >= 0.001
#> ✔ No fixed-effect standard errors are NA
#> ✔ No standard errors look unreasonably large
#> ✔ No sigma parameters are < 0.01
#> ✔ No sigma parameters are > 100
#> Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning
#> Inf
#> Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
#> -Inf
#> Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning
#> Inf
#> Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
#> -Inf

# Draw MCMC samples at the MLE (201 iterations, 200 warmup, so a single
# retained draw) for computing MCMC-based randomized quantile residuals below
samps <- sdmTMBextra::predict_mle_mcmc(m, mcmc_iter = 201, mcmc_warmup = 200)
#> 
#> SAMPLING FOR MODEL 'tmb_generic' NOW (CHAIN 1).
#> Chain 1: 
#> Chain 1: Gradient evaluation took 0.015823 seconds
#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 158.23 seconds.
#> Chain 1: Adjust your expectations accordingly!
#> Chain 1: 
#> Chain 1: 
#> Chain 1: Iteration:   1 / 201 [  0%]  (Warmup)
#> Chain 1: Iteration:  20 / 201 [  9%]  (Warmup)
#> Chain 1: Iteration:  40 / 201 [ 19%]  (Warmup)
#> Chain 1: Iteration:  60 / 201 [ 29%]  (Warmup)
#> Chain 1: Iteration:  80 / 201 [ 39%]  (Warmup)
#> Chain 1: Iteration: 100 / 201 [ 49%]  (Warmup)
#> Chain 1: Iteration: 120 / 201 [ 59%]  (Warmup)
#> Chain 1: Iteration: 140 / 201 [ 69%]  (Warmup)
#> Chain 1: Iteration: 160 / 201 [ 79%]  (Warmup)
#> Chain 1: Iteration: 180 / 201 [ 89%]  (Warmup)
#> Chain 1: Iteration: 200 / 201 [ 99%]  (Warmup)
#> Chain 1: Iteration: 201 / 201 [100%]  (Sampling)
#> Chain 1: 
#> Chain 1:  Elapsed Time: 150.776 seconds (Warm-up)
#> Chain 1:                0.172952 seconds (Sampling)
#> Chain 1:                150.949 seconds (Total)
#> Chain 1:
# MLE-MCMC randomized quantile residuals; approximately N(0, 1) if the model fits
mcmc_res <- residuals(m, type = "mle-mcmc", mcmc_samples = samps)

# QQ plot of the MCMC residuals against the standard normal
ggplot(d, aes(sample = mcmc_res)) +
  stat_qq() +
  stat_qq_line() +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme(aspect.ratio = 1)

Code

ggsave(paste0(home, "/figures/supp/qq_temp.pdf"), width = 11, height = 11, units = "cm")

Predict

Code
# Make a new data frame and predict!
# Full grid: every combination of day-of-year, area and year, treated as if
# all observations came from the "logger" source
nd <- data.frame(expand.grid(
  yday = seq(min(d$yday), max(d$yday), by = 1),
  area = unique(d$area),
  year = unique(d$year)
)) |>
  mutate(source = "logger") |>
  mutate(
    id = paste(year, area, sep = "_"),
    source_f = as.factor(source),
    year_f = as.factor(year)
  )
#> mutate: new variable 'source' (character) with one unique value and 0% NA
#> mutate: new variable 'id' (character) with 996 unique values and 0% NA
#>         new variable 'source_f' (factor) with one unique value and 0% NA
#>         new variable 'year_f' (factor) with 83 unique values and 0% NA

# Predict at every grid row; keep only the point estimate
nd$pred <- predict(m, newdata = nd)$est

In order to have a lower temperature in pre-nuclear times (without any data to inform that), we can use the nearby areas, so that FM informs BT prior to the nuclear period

Code
# Drop predictions with no informative data: BT before 1980 and SI_HA before
# 1972 (pre-nuclear period)
nd <- nd |>
  mutate(keep = ifelse((area == "BT" & year < 1980) |
                         (area == "SI_HA" & year < 1972), "N", "Y")) |>
  filter(keep == "Y")
#> mutate: new variable 'keep' (character) with 2 unique values and 0% NA
#> filter: removed 26,352 rows (7%), 338,184 rows remaining
  
# Pull out the stand-in series: FM before 1980 (replaces BT) and SI_EK before
# 1972 (replaces SI_HA)
nd_sub <- nd |>
  mutate(keep = ifelse((area == "FM" & year < 1980) |
                         (area == "SI_EK" & year < 1972), "Y", "N")) |>
  filter(keep == "Y")
#> mutate: changed 311,832 values (92%) of 'keep' (0 new NA)
#> filter: removed 311,832 rows (92%), 26,352 rows remaining

# Now change the labels: FM becomes BT, and everything else (only SI_EK is
# left in nd_sub) becomes SI_HA
nd_sub <- nd_sub |>
  mutate(area = if_else(area == "FM", "BT", "SI_HA"))
#> mutate: converted 'area' from factor to character (0 new NA)

# Bind rows and plot only the temperature series we will use for growth modelling
nd <- nd |>
  bind_rows(nd_sub) |>
  select(-keep)
#> select: dropped one variable (keep)

Keep track of the years for which we have cohorts, so that predictions can be matched with the cohort growth data

Code
# Back-calculated perch growth (cohort) data from the perch-growth repository
gdat <- readr::read_csv(
  "https://raw.githubusercontent.com/maxlindmark/perch-growth/master/data/for-analysis/dat.csv"
)
#> Rows: 364546 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (5): age_ring, area, gear, ID, sex
#> dbl (6): length_mm, age_bc, age_catch, catch_year, cohort, final_length
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Keep only area-cohort-age combinations with more than 10 fish
gdat <- gdat |>
  mutate(area_cohort_age = as.factor(paste(area, cohort, age_bc))) |>
  group_by(area_cohort_age) |>
  filter(n() > 10)
#> group_by: one grouping variable (area_cohort_age)
#> filter (grouped): removed 5,488 rows (2%), 359,058 rows remaining

# Earliest cohort per area among fish caught at age > 3; used below to trim
# temperature predictions to years where growth data exist.
# NOTE(review): the summary column is named `min`, which shadows base::min
# inside later data-masked expressions — it works, but renaming (e.g.
# `min_cohort`) would be safer.
gdat <- gdat |> 
  filter(age_catch > 3) |> 
  group_by(area) |>
  summarise(min = min(cohort)) |> 
  arrange(min)
#> filter (grouped): removed 121,647 rows (34%), 237,411 rows remaining
#> group_by: one grouping variable (area)
#> summarise: now 12 rows and 2 columns, ungrouped

# Flag prediction years that have matching growth data (cohort `min` onward)
nd <- nd |>
  left_join(gdat, by = "area") |>
  mutate(growth_dat = if_else(year >= min, "Y", "N"))
#> left_join: added one column (min)
#>            > rows only in x         0
#>            > rows only in y  (      0)
#>            > matched rows     364,536
#>            >                 =========
#>            > rows total       364,536
#> mutate: new variable 'growth_dat' (character) with 2 unique values and 0% NA

Plot predictions

Code
# Heatmap of predicted temperature by day-of-year and year, one panel per area
nd |>
  filter(growth_dat == "Y") |>
  ggplot(aes(x = yday, y = year, fill = pred, color = pred)) +
  geom_raster() +
  facet_wrap(~area, ncol = 3) +
  scale_fill_viridis(option = "magma") +
  scale_color_viridis(option = "magma") +
  labs(
    x = "Yearday", y = "Year",
    color = "Predicted SST (°C)", fill = "Predicted SST (°C)"
  )
#> filter: removed 157,380 rows (43%), 207,156 rows remaining

Detailed exploration of predictions

Code
# Loop through all areas, plot temperature as a function of yday, color by data source, facet by year

for(i in unique(nd$area)) {
  
  # Predictions for this area only
  plotdat <- nd |> filter(area == i)
  
  print(
    ggplot(plotdat, aes(yday, pred, linetype = "Model prediction (logger)")) + 
      scale_color_brewer(palette = "Accent") + 
      facet_wrap(~year) + 
      # Overlay raw observations; note `year > min(plotdat$year)` excludes the
      # first predicted year's data points — TODO confirm `>=` wasn't intended
      geom_point(data = filter(d, area == i & year > min(plotdat$year)), aes(yday, temp, color = source),
                 size = 0.75) + 
      geom_line() + 
      labs(title = paste("Area = ", i), color = "", linetype = "") + 
      guides(color = guide_legend(title.position="top", title.hjust = 0.5)) + 
      theme_sleek(base_size = 8) +
      theme(legend.position = c(0.7, 0.03), 
            legend.direction = "horizontal",
            legend.spacing.y = unit(-0.3, "cm")) + 
      labs(x = "Day of the year", y = "Predicted SST (°C)")
  )
  
  # One supplementary PDF per area
  ggsave(paste0(home, "/figures/supp/temp_pred_yday_area_", i, ".pdf" ), width = 17, height = 17, units = "cm")
  
}
#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 95,167 rows (97%), 3,449 rows remaining
#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 89,164 rows (90%), 9,452 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 89,155 rows (90%), 9,461 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 86,660 rows (88%), 11,956 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 90,466 rows (92%), 8,150 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 88,494 rows (90%), 10,122 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 87,743 rows (89%), 10,873 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 93,808 rows (95%), 4,808 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 89,755 rows (91%), 8,861 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 82,797 rows (84%), 15,819 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 95,212 rows (97%), 3,404 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 96,499 rows (98%), 2,117 rows remaining

Plot summarized data and predictions

Code
# Observed mean growing-season (1 Mar - 1 Oct) temperature per year/area/source
dsum <- d |>
  filter(yday %in% seq(yday("2000-03-01"), yday("2000-10-01"))) |>
  group_by(year, area, source) |>
  summarise(temp = mean(temp)) |>
  mutate(type = "data")
#> filter: removed 29,555 rows (30%), 69,061 rows remaining
#> group_by: 3 grouping variables (year, area, source)
#> summarise: now 1,635 rows and 4 columns, 2 group variables remaining (year, area)
#> mutate (grouped): new variable 'type' (character) with one unique value and 0% NA

# Model-predicted mean growing-season (1 Mar - 1 Oct) temperature per area/year
preds <- nd |>
  filter(growth_dat == "Y",
         yday %in% seq(yday("2000-03-01"), yday("2000-10-01")),
         source == "logger") |>
  group_by(area, year) |>
  summarise(temp = mean(pred)) |>
  mutate(type = "model")
#> filter: removed 242,846 rows (67%), 121,690 rows remaining
#> group_by: 2 grouping variables (area, year)
#> summarise: now 566 rows and 3 columns, one group variable remaining (area)
#> mutate (grouped): new variable 'type' (character) with one unique value and 0% NA

# Model predictions (grey lines) over observed annual means (colored points)
ggplot(preds, aes(year, temp)) +
  geom_point(data = dsum, aes(year, temp, color = source), size = 0.75, alpha = 0.75) +
  scale_color_brewer(palette = "Accent") +
  geom_line(linewidth = 0.5, color = "grey20") +
  facet_wrap(~area)

Make final plot

Code
# Area plotting order: warmest to coldest mean predicted temperature
order <- preds |>
  group_by(area) |>
  summarise(mean_temp = mean(temp)) |>
  arrange(desc(mean_temp))
#> group_by: one grouping variable (area)
#> summarise: now 12 rows and 2 columns, ungrouped

# Print the plotting order (note: `order` shadows base::order in this session)
order
#> # A tibble: 12 × 2
#>    area  mean_temp
#>    <chr>     <dbl>
#>  1 SI_HA     17.1 
#>  2 BT        15.4 
#>  3 TH        13.1 
#>  4 SI_EK     12.3 
#>  5 FM        12.1 
#>  6 VN        11.9 
#>  7 JM        11.7 
#>  8 MU        11.4 
#>  9 FB        11.0 
#> 10 BS        10.5 
#> 11 HO        10.3 
#> 12 RA         9.67
# Save plot order..

# Overall t_opt from 02-fit-vbge.qmd! Update if needed
topt <- 19.6

# Final figure: predicted growing-season temperature per area, panels ordered
# warmest to coldest, with the growth optimum as a dashed reference line
ggplot(preds, aes(year, temp, color = temp)) +
  facet_wrap(~factor(area, levels = order$area)) +
  geom_hline(yintercept = topt, linewidth = 0.3, linetype = 2, color = "grey") +
  geom_line() +
  scale_color_viridis(option = "magma", name = "Area") +
  labs(x = "Year", y = "Model predicted average growing season temperature") +
  guides(color = "none")

Code

# Year coverage of the final prediction series, per area
preds |>
  group_by(area) |>
  summarise(min = min(year), max = max(year)) |>
  arrange(min)
#> group_by: one grouping variable (area)
#> summarise: now 12 rows and 3 columns, ungrouped
#> # A tibble: 12 × 3
#>    area    min   max
#>    <chr> <dbl> <dbl>
#>  1 JM     1953  2022
#>  2 FM     1963  2022
#>  3 SI_HA  1966  2022
#>  4 SI_EK  1968  2022
#>  5 FB     1969  2022
#>  6 BT     1971  2022
#>  7 MU     1981  2022
#>  8 HO     1982  2022
#>  9 BS     1985  2022
#> 10 RA     1985  2022
#> 11 VN     1987  2022
#> 12 TH     2000  2022

ggsave(paste0(home, "/figures/growing_season_temp.pdf"), width = 17, height = 17, units = "cm")
Code
# Save prediction df for downstream growth modelling
write_csv(preds, file.path(home, "output", "gam_predicted_temps.csv"))

If we want to get uncertainty, we can use nsim instead; this simulates from the linear predictor using the inverse precision matrix, which is a fast way to get a distribution of samples from which we can take e.g. quantiles and means. However, it’s still slow, so the code below isn’t executed yet.

Code
# Same prediction grid as above, rebuilt for the simulation-based predictions
nd_sim <- data.frame(expand.grid(
  yday = seq(min(d$yday), max(d$yday), by = 1),
  area = unique(d$area),
  year = unique(d$year)
)) |>
  mutate(source = "logger") |>
  mutate(
    id = paste(year, area, sep = "_"),
    source_f = as.factor(source),
    year_f = as.factor(year)
  )

# Trim!
nd_sim <- left_join(nd_sim, gdat, by = "are" = "area"[0] |> (\(x) "area")(), )

# Predict!
nsim <- 500
m_pred_sims <- predict(m, newdata = nd_sim, nsim = nsim)

# Join sims with prediction data and reshape so each simulated draw becomes a
# row; the simulation columns occupy positions ncol(nd_sim)+1 ... ncol(nd_sim)+nsim.
# Native |> pipe used for consistency with the rest of the script (was %>%).
nd_sim_long <- cbind(nd_sim, as.data.frame(m_pred_sims)) |>
  pivot_longer(cols = (ncol(nd_sim) + 1):(nsim + ncol(nd_sim)))

# Summarize sims over the growing season: 10th / 50th / 90th percentiles of
# the simulated draws per year and area
sum_pred_gs <- nd_sim_long |>
    ungroup() |>
    group_by(year, area) |>
    summarise(lwr = quantile(value, prob = 0.1),
              est = quantile(value, prob = 0.5),
              upr = quantile(value, prob = 0.9)) |>
    ungroup()

# In order to have a lower temperature in before-nuclear times (without any
# data to inform that), we can use the nearby areas..
# NOTE(review): the original filtered `preds` here, which silently discarded
# the simulation summaries (`sum_pred_gs`) computed above — the whole nsim
# exercise would have had no effect. Filter `sum_pred_gs` instead so the
# uncertainty intervals (lwr/est/upr) carry through.
sum_pred_gs_main <- sum_pred_gs |>
  mutate(keep = "Y",
         keep = ifelse(area == "BT" & year < 1980, "N", keep),
         keep = ifelse(area == "SI_HA" & year < 1972, "N", keep)) |>
  filter(keep == "Y")

sum_pred_gs_sub <- sum_pred_gs |>
  mutate(keep = "N",
         keep = ifelse(area == "FM" & year < 1980, "Y", keep), # use FM instead of BT
         keep = ifelse(area == "SI_EK" & year < 1972, "Y", keep)) |> # use SI_EK instead of SI_HA
  filter(keep == "Y")

# Now change the labels to BT and SI_EK...
sum_pred_gs_sub <- sum_pred_gs_sub |>
  mutate(area = ifelse(area == "FM", "BT", "SI_HA"))

# Bind rows and keep only the temperature series we will use for growth
# modelling (no `type` column here, unlike `preds`, so only `keep` is dropped)
sum_pred_gs <- bind_rows(sum_pred_gs_main, sum_pred_gs_sub) |>
  select(-keep)

# Plotting order, warmest to coldest, using the median of the sims (`est`)
order <- sum_pred_gs |>
  group_by(area) |>
  summarise(mean_temp = mean(est)) |>
  arrange(desc(mean_temp))